MidTerm Telecom Customer Churn Analysis Emma Horton – Data Visualization Midterm Project

Libraries Used:

library(ggplot2)
library(dplyr)
library(plotly)
library(lsr)
library(broom)
library(htmlwidgets)

Data Loading

telco <- read.csv("Telco-Customer-Churn.csv", na.strings = c(""))

#str(telco)
#summary(telco)
#head(telco, 5)

Data Cleaning

# SeniorCitizen to factor
telco$SeniorCitizen <- factor(telco$SeniorCitizen, levels=c(0,1), labels=c("No", "Yes"))

# Character columns to factors
cat_cols <- c("gender","Partner","Dependents","PhoneService",
              "MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
              "DeviceProtection","TechSupport","StreamingTV","StreamingMovies",
              "Contract","PaperlessBilling","PaymentMethod","Churn")

telco[cat_cols] <- lapply(telco[cat_cols], factor)

# Fix TotalCharges: convert to numeric
telco$TotalCharges <- as.numeric(as.character(telco$TotalCharges))

# Replace "No internet service" and "No phone service" with "No", replace "No phone service" with "No"
no_internet_cols <- c("OnlineSecurity","OnlineBackup","DeviceProtection",
                      "TechSupport","StreamingTV","StreamingMovies")
for(col in no_internet_cols) {
    telco[[col]] <- factor(ifelse(telco[[col]]=="No internet service", "No", 
                                  as.character(telco[[col]])))
}
telco$MultipleLines <- factor(ifelse(telco$MultipleLines=="No phone service", "No", 
                                     as.character(telco$MultipleLines)))
# Drop customerID
telco$customerID <- NULL

# Verify results
#sum(is.na(telco$TotalCharges))  
#levels(telco$MultipleLines)        
#levels(telco$OnlineSecurity)       

Set Color Pallet

project_colors <- c(
  "Yes" = "#E74C3C",    
  "No" = "#3498DB",         
  "Female" = "#9B59B6",    
  "Male" = "#1ABC9C",     
  "Month-to-month" = "#F39C12",  
  "One year" = "#2ECC71",       
  "Two year" = "#34495E",       
  "DSL" = "#16A085",        
  "Fiber optic" = "#D35400",
  "No" = "#95A5A6"          
)

Exploratory Analysis Distribution of a single categorical variable 1. Categorical Variable Distribution – Contract Type vs Churn (Bar Chart) A key categorical variable is the Contract type. I suspect contract length has an effect on churn. I will plot the count of customers by contract type, and distinguish churn vs not churn with color.

contract_churn_data <- telco %>%
  group_by(Contract, Churn) %>%
  summarise(count = n(), .groups = 'drop')

plot_ly(contract_churn_data,
        x = ~Contract,
        y = ~count,
        color = ~Churn,
        colors = project_colors,
        type = "bar") %>%
  layout(
    barmode = "group",
    title = list(text = "Customer Count by Contract Type and Churn Status", x = 0.5),
    xaxis = list(title = "Contract Type"),
    yaxis = list(title = "Number of Customers"),
    legend = list(title = list(text = "Churn"))
  )
## Warning: Duplicate levels detected
## Duplicate levels detected

This is a strong indicator that contract length is related to churn. I will perform a deeper dive into the relationship

# Cross-tab Churn by Contract
table(telco$Churn, telco$Contract)
##      
##       Month-to-month One year Two year
##   No            2220     1307     1647
##   Yes           1655      166       48
# Chi-square
chisq_test <- chisq.test(table(telco$Churn, telco$Contract))
chisq_test
## 
##  Pearson's Chi-squared test
## 
## data:  table(telco$Churn, telco$Contract)
## X-squared = 1184.6, df = 2, p-value < 2.2e-16
# mosaic 
mosaicplot(table(telco$Contract, telco$Churn),
           color = c(project_colors["No"], project_colors["Yes"]),
           main = "Churn vs Contract Type",
           xlab = "Contract Type", ylab = "Churn")

Distribution of a single quantitative variable 2. Numerical Variable Distribution – Tenure (Histogram) I want to explore the distribution of tenure (how long customers have been with the company, in months). A histogram will show the frequency of customers by their tenure.

ggplot(telco, aes(x = tenure)) +
  geom_histogram(binwidth = 5, fill=project_colors["No"], color="black") +
  labs(title="Distribution of Customer Tenure",
       x="Tenure (months)", y="Number of Customers") +
  theme_minimal()

I want to explore the representation of churn status in regards to tenure, faceting on contract type.

ggplot(telco, aes(x = tenure, fill = Churn)) +
  geom_histogram(binwidth = 5, color = "black", position = "stack") +
  facet_wrap(~ Contract) +
  scale_fill_manual(values = project_colors) +
  labs(
    title = "Distribution of Customer Tenure by Contract Type and Churn Status",
    x = "Tenure (months)",
    y = "Number of Customers",
    fill = "Churn"
  ) +
  theme_minimal()

Distribution of two categorical variables 3. Two Categorical Variable Distribution – Payment Method vs. Churn (Grouped Bar) I want to explore the relationship between payment method (bank transfer, credit card, electronic / mailed check) and churn. A grouped bar will display the relationship between payment method and churn.

payment_churn_data <- telco %>%
  group_by(PaymentMethod, Churn) %>%
  summarise(count = n(), .groups = "drop")

# Remove "(automatic)" from PaymentMethod labels
telco$PaymentMethod <- gsub("\\s*\\(automatic\\)", "", telco$PaymentMethod)


plot_ly(payment_churn_data,
        x = ~PaymentMethod,
        y = ~count,
        color = ~Churn,
        colors = project_colors,
        type = "bar") %>%
  layout(
    barmode = "group",
    title = list(text = "Customer Churn by Payment Method", x = 0.5),
    xaxis = list(title = "Payment Method", tickangle = 30),
    yaxis = list(title = "Number of Customers"),
    legend = list(title = list(text = "Churn"))
  )
## Warning: Duplicate levels detected
## Duplicate levels detected

I want to explore how contract type will impact the distribution

ggplot(telco, aes(x = PaymentMethod, fill = Churn)) +
  geom_bar(position = "dodge") +
  facet_wrap(~ Contract) +
  scale_fill_manual(values = project_colors) +
  labs(
    title = "Churn Distribution by Payment Method and Contract Type",
    x = "Payment Method",
    y = "Customer Count",
    fill = "Churn"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))

4. Quantitative Variable Across Categories – Monthly Charges by Contract Type (Boxplot) I want explore how monthly charges differ by contract type.

ggplot(telco, aes(x = Contract, y = MonthlyCharges, fill = Contract)) +
  geom_boxplot() +
  scale_fill_manual(values = project_colors) +
  labs(
    title = "Monthly Charges by Contract Type",
    x = "Contract Type",
    y = "Monthly Charges (USD)"
  ) +
  theme_minimal()

Instead of mucking around to figure out the reason why I suspect a quick regression will give me more answers in less time.

Quick regression

reg_data <- telco %>%
  select(-TotalCharges) %>%
  na.omit()  

reg_data <- reg_data %>%
  mutate(across(where(is.character), as.factor))

model <- lm(MonthlyCharges ~ ., data = reg_data)

summary(model)
## 
## Call:
## lm(formula = MonthlyCharges ~ ., data = reg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2285 -0.6140 -0.0057  0.6070  4.8419 
## 
## Coefficients:
##                                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                    2.498e+01  6.001e-02  416.193   <2e-16 ***
## genderMale                     2.336e-02  2.448e-02    0.954    0.340    
## SeniorCitizenYes               1.511e-02  3.565e-02    0.424    0.672    
## PartnerYes                    -3.945e-02  2.958e-02   -1.334    0.182    
## DependentsYes                  1.287e-02  3.140e-02    0.410    0.682    
## tenure                         5.209e-06  8.443e-04    0.006    0.995    
## PhoneServiceYes                2.005e+01  4.816e-02  416.349   <2e-16 ***
## MultipleLinesYes               5.018e+00  2.955e-02  169.829   <2e-16 ***
## InternetServiceFiber optic     2.496e+01  3.518e-02  709.555   <2e-16 ***
## InternetServiceNo             -2.505e+01  4.893e-02 -511.848   <2e-16 ***
## OnlineSecurityYes              5.014e+00  3.224e-02  155.490   <2e-16 ***
## OnlineBackupYes                4.992e+00  3.025e-02  165.060   <2e-16 ***
## DeviceProtectionYes            5.022e+00  3.133e-02  160.289   <2e-16 ***
## TechSupportYes                 5.030e+00  3.287e-02  153.037   <2e-16 ***
## StreamingTVYes                 9.974e+00  3.207e-02  311.040   <2e-16 ***
## StreamingMoviesYes             9.967e+00  3.209e-02  310.579   <2e-16 ***
## ContractOne year               7.762e-03  3.844e-02    0.202    0.840    
## ContractTwo year              -2.599e-02  4.629e-02   -0.562    0.574    
## PaperlessBillingYes           -2.039e-02  2.739e-02   -0.744    0.457    
## PaymentMethodCredit card       9.687e-04  3.711e-02    0.026    0.979    
## PaymentMethodElectronic check -1.768e-02  3.644e-02   -0.485    0.628    
## PaymentMethodMailed check     -1.403e-02  3.949e-02   -0.355    0.722    
## ChurnYes                      -2.184e-02  3.262e-02   -0.670    0.503    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.026 on 7020 degrees of freedom
## Multiple R-squared:  0.9988, Adjusted R-squared:  0.9988 
## F-statistic: 2.75e+05 on 22 and 7020 DF,  p-value: < 2.2e-16

To display the regression, I will display a coefficient Plot Bar Chart of Estimates

coeffs <- broom::tidy(model) %>%
  filter(term != "(Intercept)") %>%
  arrange(desc(estimate))

ggplot(coeffs, aes(x = reorder(term, estimate), y = estimate)) +
geom_col(fill = project_colors["Yes"]) +
  coord_flip() +
  labs(
    title = "Regression Coefficients: Impact on Monthly Charges",
    x = "Predictor",
    y = "Estimated Impact ($)"
  ) +
  theme_minimal()

ggplot(telco, aes(x = InternetService, y = MonthlyCharges, fill = InternetService)) +
  geom_boxplot() +
  scale_fill_manual(values = c(
    "DSL" = "#16A085",
    "Fiber optic" = "#D35400",
    "No" = "#95A5A6"
  )) +
  labs(
    title = "Monthly Charges by Internet Service Type",
    x = "Internet Service",
    y = "Monthly Charges (USD)"
  ) +
  theme_minimal()

Monthly Charges by Streaming TV Usage

ggplot(telco, aes(x = StreamingTV, y = MonthlyCharges, fill = StreamingTV)) +
  geom_boxplot() +
  scale_fill_manual(values = c("Yes" = "#E74C3C", "No" = "#3498DB")) +
  labs(
    title = "Monthly Charges by Streaming TV Subscription",
    x = "Streaming TV",
    y = "Monthly Charges (USD)"
  ) +
  theme_minimal()

Monthly Charges by Tech Support

ggplot(telco, aes(x = TechSupport, y = MonthlyCharges, fill = TechSupport)) +
  geom_boxplot() +
  scale_fill_manual(values = c("Yes" = "#E74C3C", "No" = "#3498DB")) +
  labs(
    title = "Monthly Charges by Tech Support Access",
    x = "Tech Support",
    y = "Monthly Charges (USD)"
  ) +
  theme_minimal()

Monthly Charges by Paperless Billing

ggplot(telco, aes(x = PaperlessBilling, y = MonthlyCharges, fill = PaperlessBilling)) +
  geom_boxplot() +
  scale_fill_manual(values = c("Yes" = "#E74C3C", "No" = "#3498DB")) +
  labs(
    title = "Monthly Charges by Paperless Billing Status",
    x = "Paperless Billing",
    y = "Monthly Charges (USD)"
  ) +
  theme_minimal()

5. Relationship between two quantitative variables Relationship Between Two Variables – Tenure vs Monthly Charges (Scatter Plot) Now I explore how a numeric variable relates to churn. Plotting MonthlyCharges against tenure, and using color to show churn status, can reveal if churners cluster in some region.

telco$Churn <- factor(trimws(telco$Churn), levels = c("No", "Yes"))

ggplot(telco, aes(x = tenure, y = MonthlyCharges, color = Churn)) +
  geom_point(alpha = 0.6) +
  labs(title = "Monthly Charges vs Tenure, by Churn Status",
       x = "Tenure (months)", y = "Monthly Charges (USD)") +
  scale_color_manual(values = project_colors[c("No", "Yes")]) +
  theme_minimal()

Create a chart not directly discussed in class (e.g. Heatmap, Radar, Contour, Sunburst) 6. Heatmap – Churn Rate by Contract and Internet Service Finally, I wanted to create a heatmap to visualize churn rates across two categorical dimensions: Contract type and InternetService type. This will highlight combinations (e.g. Fiber optic + month-to-month) that have especially high churn.

churn_rate <- telco %>%
  group_by(Contract, InternetService) %>%
  summarize(churn_pct = mean(Churn == "Yes") * 100)
## `summarise()` has grouped output by 'Contract'. You can override using the
## `.groups` argument.
ggplot(churn_rate, aes(x = Contract, y = InternetService, fill = churn_pct)) +
  geom_tile(color="white") +
  scale_fill_gradient(low="lightyellow", high="red", name="Churn Rate (%)") +
  labs(title="Churn Rate Heatmap: Contract vs Internet Service", 
       x="Contract Type", y="Internet Service") +
  theme_minimal()

7. Bubble Plot – Customer Charges Over Time by Contract Type I wanted to highlight how customers on month-to-month contracts often accumulate high charges earlier, while those on longer contracts tend to accrue charges more gradually.

Key Churn point

churn_by_tenure <- telco %>%
  group_by(tenure) %>%
  summarize(
    churn_rate = mean(Churn == "Yes")
  )

ggplot(churn_by_tenure, aes(x = tenure, y = churn_rate)) +
  geom_line(color = "steelblue") +
  geom_point() +
  labs(
    title = "Churn Rate by Tenure",
    x = "Tenure (months)",
    y = "Churn Rate"
  )

churn_by_tenure <- telco %>%
  group_by(tenure) %>%
  summarize(
    churn_rate = mean(Churn == "Yes")
  )

inflection <- ggplot(churn_by_tenure, aes(x = tenure, y = churn_rate)) +
  geom_smooth(method = "loess", span = 0.2, se = FALSE, color = "darkred") +
  labs(
    title = "Smoothed Churn Rate by Tenure",
    x = "Tenure (months)",
    y = "Churn Rate"
  )

plot <- ggplotly(inflection)
## `geom_smooth()` using formula = 'y ~ x'
inflection
## `geom_smooth()` using formula = 'y ~ x'

saveWidget(plot, "inflection.html", selfcontained = TRUE)